In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
In [2]:
data= sns.load_dataset("titanic")

Objectif: We want to predict the feature "survived"

EDA¶

In [3]:
#eda
print("--------------------------------")
display(data.shape)
print("--------------------------------")
display(data.info())
print("--------------------------------")
display(data.isnull().sum()[data.isnull().sum()>0])
print("--------------------------------")
display(data.describe(include="all"))
print("--------------------------------")
display(data.corr().style.background_gradient(cmap='coolwarm'))
--------------------------------
(891, 15)
--------------------------------
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 15 columns):
 #   Column       Non-Null Count  Dtype   
---  ------       --------------  -----   
 0   survived     891 non-null    int64   
 1   pclass       891 non-null    int64   
 2   sex          891 non-null    object  
 3   age          714 non-null    float64 
 4   sibsp        891 non-null    int64   
 5   parch        891 non-null    int64   
 6   fare         891 non-null    float64 
 7   embarked     889 non-null    object  
 8   class        891 non-null    category
 9   who          891 non-null    object  
 10  adult_male   891 non-null    bool    
 11  deck         203 non-null    category
 12  embark_town  889 non-null    object  
 13  alive        891 non-null    object  
 14  alone        891 non-null    bool    
dtypes: bool(2), category(2), float64(2), int64(4), object(5)
memory usage: 80.7+ KB
None
--------------------------------
age            177
embarked         2
deck           688
embark_town      2
dtype: int64
--------------------------------
survived pclass sex age sibsp parch fare embarked class who adult_male deck embark_town alive alone
count 891.000000 891.000000 891 714.000000 891.000000 891.000000 891.000000 889 891 891 891 203 889 891 891
unique NaN NaN 2 NaN NaN NaN NaN 3 3 3 2 7 3 2 2
top NaN NaN male NaN NaN NaN NaN S Third man True C Southampton no True
freq NaN NaN 577 NaN NaN NaN NaN 644 491 537 537 59 644 549 537
mean 0.383838 2.308642 NaN 29.699118 0.523008 0.381594 32.204208 NaN NaN NaN NaN NaN NaN NaN NaN
std 0.486592 0.836071 NaN 14.526497 1.102743 0.806057 49.693429 NaN NaN NaN NaN NaN NaN NaN NaN
min 0.000000 1.000000 NaN 0.420000 0.000000 0.000000 0.000000 NaN NaN NaN NaN NaN NaN NaN NaN
25% 0.000000 2.000000 NaN 20.125000 0.000000 0.000000 7.910400 NaN NaN NaN NaN NaN NaN NaN NaN
50% 0.000000 3.000000 NaN 28.000000 0.000000 0.000000 14.454200 NaN NaN NaN NaN NaN NaN NaN NaN
75% 1.000000 3.000000 NaN 38.000000 1.000000 0.000000 31.000000 NaN NaN NaN NaN NaN NaN NaN NaN
max 1.000000 3.000000 NaN 80.000000 8.000000 6.000000 512.329200 NaN NaN NaN NaN NaN NaN NaN NaN
--------------------------------
C:\Users\User\AppData\Local\Temp\ipykernel_28720\1412386682.py:11: FutureWarning: The default value of numeric_only in DataFrame.corr is deprecated. In a future version, it will default to False. Select only valid columns or specify the value of numeric_only to silence this warning.
  display(data.corr().style.background_gradient(cmap='coolwarm'))
  survived pclass age sibsp parch fare adult_male alone
survived 1.000000 -0.338481 -0.077221 -0.035322 0.081629 0.257307 -0.557080 -0.203367
pclass -0.338481 1.000000 -0.369226 0.083081 0.018443 -0.549500 0.094035 0.135207
age -0.077221 -0.369226 1.000000 -0.308247 -0.189119 0.096067 0.280328 0.198270
sibsp -0.035322 0.083081 -0.308247 1.000000 0.414838 0.159651 -0.253586 -0.584471
parch 0.081629 0.018443 -0.189119 0.414838 1.000000 0.216225 -0.349943 -0.583398
fare 0.257307 -0.549500 0.096067 0.159651 0.216225 1.000000 -0.182024 -0.271832
adult_male -0.557080 0.094035 0.280328 -0.253586 -0.349943 -0.182024 1.000000 0.404744
alone -0.203367 0.135207 0.198270 -0.584471 -0.583398 -0.271832 0.404744 1.000000
In [4]:
import plotly.express as px
for elem in data.columns:
    if elem!='survived':
        fig = px.bar(data, x='survived', y=elem)
        fig.show()

we decide to remove the feature "deck" with other that are duplicate lile "alive"¶

Preprocessing¶

In [5]:
data.head(3)
Out[5]:
survived pclass sex age sibsp parch fare embarked class who adult_male deck embark_town alive alone
0 0 3 male 22.0 1 0 7.2500 S Third man True NaN Southampton no False
1 1 1 female 38.0 1 0 71.2833 C First woman False C Cherbourg yes False
2 1 3 female 26.0 0 0 7.9250 S Third woman False NaN Southampton yes True
In [6]:
data.drop(["deck","alive"],axis=1,inplace=True)
In [7]:
data.isna().sum()[data.isna().sum()>0]
Out[7]:
age            177
embarked         2
embark_town      2
dtype: int64
In [8]:
data.age = data.age.fillna(data.age.mode()[0])
data.embarked = data.embarked.fillna(data.embarked.mode()[0])
data.embark_town = data.embark_town.fillna(data.embark_town.mode()[0])
In [9]:
data.head(2)
Out[9]:
survived pclass sex age sibsp parch fare embarked class who adult_male embark_town alone
0 0 3 male 22.0 1 0 7.2500 S Third man True Southampton False
1 1 1 female 38.0 1 0 71.2833 C First woman False Cherbourg False
In [10]:
data.sex=data.sex.map({"male":0,"female":1})
data.embarked=data.embarked.map({"S":0,"C":1,"Q":2})
data["class"]=data["class"].map({'Third':0, 'First':1, 'Second':2})
data.who=data.who.map({"man":0,"woman":1,"child":2})
data.adult_male=data.adult_male.map({True:0,False:1})
data.embark_town=data.embark_town.map({'Southampton':0, 'Cherbourg':1, 'Queenstown':2})
data.alone=data.alone.map({True:0,False:1})
In [11]:
data.age=data.age.astype(int)
data.fare= data.fare.round().astype(int)
data["class"]=data["class"].astype(int)
In [12]:
data.head()
Out[12]:
survived pclass sex age sibsp parch fare embarked class who adult_male embark_town alone
0 0 3 0 22 1 0 7 0 0 0 0 0 1
1 1 1 1 38 1 0 71 1 1 1 1 1 1
2 1 3 1 26 0 0 8 0 0 1 1 0 0
3 1 1 1 35 1 0 53 0 1 1 1 0 1
4 0 3 0 35 0 0 8 0 0 0 0 0 0

split the data from the target¶

In [13]:
X= data.drop(["survived"],axis=1)
y= data['survived']

normalization of data¶

In [14]:
X= (X - X.min()) / (X.max() - X.min())
In [15]:
X
Out[15]:
pclass sex age sibsp parch fare embarked class who adult_male embark_town alone
0 1.0 0.0 0.2750 0.125 0.000000 0.013672 0.0 0.0 0.0 0.0 0.0 1.0
1 0.0 1.0 0.4750 0.125 0.000000 0.138672 0.5 0.5 0.5 1.0 0.5 1.0
2 1.0 1.0 0.3250 0.000 0.000000 0.015625 0.0 0.0 0.5 1.0 0.0 0.0
3 0.0 1.0 0.4375 0.125 0.000000 0.103516 0.0 0.5 0.5 1.0 0.0 1.0
4 1.0 0.0 0.4375 0.000 0.000000 0.015625 0.0 0.0 0.0 0.0 0.0 0.0
... ... ... ... ... ... ... ... ... ... ... ... ...
886 0.5 0.0 0.3375 0.000 0.000000 0.025391 0.0 1.0 0.0 0.0 0.0 0.0
887 0.0 1.0 0.2375 0.000 0.000000 0.058594 0.0 0.5 0.5 1.0 0.0 0.0
888 1.0 1.0 0.3000 0.125 0.333333 0.044922 0.0 0.0 0.5 1.0 0.0 1.0
889 0.0 0.0 0.3250 0.000 0.000000 0.058594 0.5 0.5 0.0 0.0 0.5 0.0
890 1.0 0.0 0.4000 0.000 0.000000 0.015625 1.0 0.0 0.0 0.0 1.0 0.0

891 rows × 12 columns

Machine Learning Modelization¶

In [18]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train,y_test = train_test_split(X,y,test_size=0.33)
In [23]:
import tensorflow as tf
from tensorflow import keras
model = keras.Sequential([keras.layers.Flatten(input_shape=(len(X.columns),)),
                          keras.layers.Dense(128,activation=tf.nn.relu),
                          keras.layers.Dense(256,activation=tf.nn.relu),
                          keras.layers.Dense(256,activation=tf.nn.relu),
                          keras.layers.Dense(1,activation=tf.nn.sigmoid)])
In [24]:
model.compile(optimizer="adam", loss="binary_crossentropy",metrics=['accuracy'])
In [25]:
model.fit(X_train,y_train,epochs=50,batch_size=1)
test_loss,test_acc=model.evaluate(X_test, y_test)
Epoch 1/50
596/596 [==============================] - 1s 857us/step - loss: 0.4914 - accuracy: 0.7718
Epoch 2/50
596/596 [==============================] - 1s 862us/step - loss: 0.4399 - accuracy: 0.8121
Epoch 3/50
596/596 [==============================] - 1s 863us/step - loss: 0.4400 - accuracy: 0.8171
Epoch 4/50
596/596 [==============================] - 1s 851us/step - loss: 0.4273 - accuracy: 0.8138
Epoch 5/50
596/596 [==============================] - 1s 844us/step - loss: 0.4189 - accuracy: 0.8289
Epoch 6/50
596/596 [==============================] - 1s 857us/step - loss: 0.4167 - accuracy: 0.8305
Epoch 7/50
596/596 [==============================] - 1s 863us/step - loss: 0.4153 - accuracy: 0.8188
Epoch 8/50
596/596 [==============================] - 1s 868us/step - loss: 0.4121 - accuracy: 0.8289
Epoch 9/50
596/596 [==============================] - 1s 857us/step - loss: 0.4205 - accuracy: 0.8087
Epoch 10/50
596/596 [==============================] - 1s 876us/step - loss: 0.4071 - accuracy: 0.8289
Epoch 11/50
596/596 [==============================] - 1s 904us/step - loss: 0.4063 - accuracy: 0.8255
Epoch 12/50
596/596 [==============================] - 1s 908us/step - loss: 0.4083 - accuracy: 0.8255
Epoch 13/50
596/596 [==============================] - 1s 872us/step - loss: 0.4091 - accuracy: 0.8221
Epoch 14/50
596/596 [==============================] - 1s 859us/step - loss: 0.4025 - accuracy: 0.8272
Epoch 15/50
596/596 [==============================] - 1s 856us/step - loss: 0.4057 - accuracy: 0.8372
Epoch 16/50
596/596 [==============================] - 1s 858us/step - loss: 0.4000 - accuracy: 0.8356
Epoch 17/50
596/596 [==============================] - 1s 858us/step - loss: 0.4008 - accuracy: 0.8289
Epoch 18/50
596/596 [==============================] - 1s 890us/step - loss: 0.4023 - accuracy: 0.8389
Epoch 19/50
596/596 [==============================] - 1s 853us/step - loss: 0.3947 - accuracy: 0.8423
Epoch 20/50
596/596 [==============================] - 1s 862us/step - loss: 0.3960 - accuracy: 0.8289
Epoch 21/50
596/596 [==============================] - 1s 864us/step - loss: 0.3976 - accuracy: 0.8406
Epoch 22/50
596/596 [==============================] - 1s 877us/step - loss: 0.3962 - accuracy: 0.8389
Epoch 23/50
596/596 [==============================] - 1s 857us/step - loss: 0.3913 - accuracy: 0.8406
Epoch 24/50
596/596 [==============================] - 1s 858us/step - loss: 0.3946 - accuracy: 0.8406
Epoch 25/50
596/596 [==============================] - 1s 858us/step - loss: 0.3887 - accuracy: 0.8372
Epoch 26/50
596/596 [==============================] - 1s 858us/step - loss: 0.3825 - accuracy: 0.8456
Epoch 27/50
596/596 [==============================] - 1s 864us/step - loss: 0.3826 - accuracy: 0.8523
Epoch 28/50
596/596 [==============================] - 1s 857us/step - loss: 0.3837 - accuracy: 0.8440
Epoch 29/50
596/596 [==============================] - 1s 857us/step - loss: 0.3852 - accuracy: 0.8372
Epoch 30/50
596/596 [==============================] - 1s 865us/step - loss: 0.3838 - accuracy: 0.8440
Epoch 31/50
596/596 [==============================] - 1s 873us/step - loss: 0.3840 - accuracy: 0.8406
Epoch 32/50
596/596 [==============================] - 1s 874us/step - loss: 0.3789 - accuracy: 0.8406
Epoch 33/50
596/596 [==============================] - 1s 874us/step - loss: 0.3812 - accuracy: 0.8456
Epoch 34/50
596/596 [==============================] - 1s 857us/step - loss: 0.3764 - accuracy: 0.8456
Epoch 35/50
596/596 [==============================] - 1s 857us/step - loss: 0.3759 - accuracy: 0.8440
Epoch 36/50
596/596 [==============================] - 1s 857us/step - loss: 0.3781 - accuracy: 0.8423
Epoch 37/50
596/596 [==============================] - 1s 870us/step - loss: 0.3704 - accuracy: 0.8523
Epoch 38/50
596/596 [==============================] - 1s 875us/step - loss: 0.3725 - accuracy: 0.8507
Epoch 39/50
596/596 [==============================] - 1s 873us/step - loss: 0.3645 - accuracy: 0.8440
Epoch 40/50
596/596 [==============================] - 1s 892us/step - loss: 0.3775 - accuracy: 0.8440
Epoch 41/50
596/596 [==============================] - 1s 891us/step - loss: 0.3723 - accuracy: 0.8490
Epoch 42/50
596/596 [==============================] - 1s 943us/step - loss: 0.3682 - accuracy: 0.8540
Epoch 43/50
596/596 [==============================] - 1s 932us/step - loss: 0.3642 - accuracy: 0.8523
Epoch 44/50
596/596 [==============================] - 1s 1ms/step - loss: 0.3664 - accuracy: 0.8440
Epoch 45/50
596/596 [==============================] - 1s 1ms/step - loss: 0.3604 - accuracy: 0.8456
Epoch 46/50
596/596 [==============================] - 1s 878us/step - loss: 0.3692 - accuracy: 0.8490
Epoch 47/50
596/596 [==============================] - 1s 883us/step - loss: 0.3586 - accuracy: 0.8523
Epoch 48/50
596/596 [==============================] - 1s 887us/step - loss: 0.3593 - accuracy: 0.8540
Epoch 49/50
596/596 [==============================] - 1s 877us/step - loss: 0.3519 - accuracy: 0.8591
Epoch 50/50
596/596 [==============================] - 1s 872us/step - loss: 0.3581 - accuracy: 0.8490
10/10 [==============================] - 0s 2ms/step - loss: 0.4679 - accuracy: 0.8136
In [32]:
print('test_acc -> ',test_acc)
print('test_loss -> ',test_loss)
test_acc ->  0.8135592937469482
test_loss ->  0.46787524223327637

Accuracy prediction survival: 81%

In [31]:
from sklearn.metrics import confusion_matrix

prediction = pd.DataFrame(model.predict(X_test).round().astype(int))
confusion_matrix(y_test,prediction)
10/10 [==============================] - 0s 804us/step
Out[31]:
array([[169,  20],
       [ 35,  71]], dtype=int64)
In [ ]: